import urllib
import os
# http://europa.eu/rapid/pressReleasesAction.do?reference=IP/09/1274&format...


# PROBLEM -- John didn't include other types of content, e.g. the "PESC" documents:
# http://europa.eu/rapid/pressReleasesAction.do?reference=PESC/
# This is where the praise for Kyrgyzstan joining OPCAT was expressed:
# http://europa.eu/rapid/pressReleasesAction.do?reference=PESC/10/10&format=HTML&aged=0&language=EN&guiLanguage=en


# A document URL is built as <prefix> + <2-digit year> + "/" + <doc number> + gen2.
# URL prefix for the "IP" class of press releases.
gen1 = "http://europa.eu/rapid/pressReleasesAction.do?reference=IP/"
# URL prefix for the "PESC" class of press releases.
gen1PESC = "http://europa.eu/rapid/pressReleasesAction.do?reference=PESC/"
# Query-string tail shared by both classes (HTML format, English).
gen2 = "&format=HTML&aged=0&language=EN&guiLanguage=en"

# Directory the downloaded pages are written into.
# Earlier location, kept for reference:
#newpath = "C:\Documents and Settings\John Sheffield\Desktop\Simmons RA\EuroData"
newpath = r"C:\Documents and Settings\Rich\Desktop\Rewards for Ratification\text analysis\EuroData14jul2010"

# Two-digit years to scrape, 1985 through 2010, in chronological order.
year = [
    '85', '86', '87', '88', '89',
    '90', '91', '92', '93', '94', '95', '96', '97', '98', '99',
    '00', '01', '02', '03', '04', '05', '06', '07', '08', '09',
    '10',
]
# Shorter list for a quick run over the most recent years only:
#year = ['09','10']

# Candidate document numbers tried for every year: 1 through 9999.
dnum = range(1, 10000)

### Turns 3 into 0003 and leaves others intact
def mk4dig(num):
    """Return ``num`` as a string, left-padded with zeros to four characters.

    Values that already need four or more characters are returned unchanged
    (e.g. 3 -> "0003", 9999 -> "9999", 12345 -> "12345").  The previous
    slice-based version silently dropped the leading digits of anything
    longer than four characters.

    num -- the document number; anything ``str()`` accepts.
    """
    return str(num).zfill(4)

def isItGood(candFile):
    """Tell whether a downloaded page is a real document.

    candFile -- the page contents as a string.

    Returns False when the page carries the "Not Available" date meta tag,
    True otherwise.
    """
    unavailable_marker = '<meta name="date" content="Not Available">'
    return unavailable_marker not in candFile




    # loop for the "PESC" class of documents
# For every year, request PESC documents by ascending number and save each
# page that is actually available as "<yr>-<4-digit number>PESC.txt".
for yr in year:
    print('Beginning year ' + yr + ': PESC')
    # Unavailable pages seen so far this year.  The count is cumulative (it is
    # not reset after a successful download) and starts over for each year.
    countfail = 0
    for docnum in dnum:
        # Stop probing this year once 11 unavailable pages have accumulated.
        if countfail > 10:
            break
        myfile = urllib.urlopen(gen1PESC + yr + "/" + str(docnum) + gen2)
        try:
            fileToSave = myfile.read()
        finally:
            # Always release the connection; thousands of requests are made.
            myfile.close()
        if not isItGood(fileToSave):
            countfail = countfail + 1
            continue
        newFile = open(os.path.join(newpath, yr + "-" + mk4dig(docnum) + "PESC.txt"), 'w')
        try:
            newFile.write(fileToSave)
        finally:
            # Close the output file even if the write fails.
            newFile.close()




# loop for the "IP" class of documents
# For every year, request IP documents by ascending number and save each
# page that is actually available as "<yr>-<4-digit number>IP.txt".
for yr in year:
    print('Beginning year ' + yr + ': IP')
    # Unavailable pages seen so far this year.  The count is cumulative (it is
    # not reset after a successful download) and starts over for each year.
    countfail = 0
    for docnum in dnum:
        # Stop probing this year once 11 unavailable pages have accumulated.
        if countfail > 10:
            break
        myfile = urllib.urlopen(gen1 + yr + "/" + str(docnum) + gen2)
        try:
            fileToSave = myfile.read()
        finally:
            # Always release the connection; thousands of requests are made.
            myfile.close()
        if not isItGood(fileToSave):
            countfail = countfail + 1
            continue
        newFile = open(os.path.join(newpath, yr + "-" + mk4dig(docnum) + "IP.txt"), 'w')
        try:
            newFile.write(fileToSave)
        finally:
            # Close the output file even if the write fails.
            newFile.close()


